Loading required libraries

library(gapminder)
library(pacman)
pacman::p_load(data.table, fixest, stargazer, dplyr, magrittr)
library("ggplot2")
library(gganimate)

Gapminder data set:

# Preview the first six rows of the gapminder data.
head(gapminder, n = 6L)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# Response (life expectancy) and regressor (population), pulled out of the
# gapminder data as plain vectors for the regression fitted below.
Y <- gapminder[["lifeExp"]]
X <- gapminder[["pop"]]
# Number of observations in the data set.
nrow(gapminder)
## [1] 1704

#Description:

The data set has 1704 observations, which fills a size niche between iris (150 rows) and the likes of diamonds (54K rows). It has 6 variables: country, a factor with 142 levels; continent, a factor with 5 levels; year, going from 1952 to 2007 in increments of 5 years; pop, the population; gdpPercap, the GDP per capita; and lifeExp, the life expectancy.

Transition through distinct states in time

# Base scatter plot: population (log10 x-axis) against life expectancy,
# with point size mapped to GDP per capita and one colour per country.
# Legends are suppressed on the point layer.
p <- ggplot(
  data = gapminder,
  mapping = aes(x = pop, y = lifeExp, size = gdpPercap, colour = country)
) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_x_log10() +
  scale_size(range = c(2, 12)) +
  scale_colour_viridis_d() +
  labs(x = "POP", y = "Life expectancy")
p

# Animate the base plot over `year`; the title shows the time of the
# frame currently being drawn.
p +
  transition_time(time = year) +
  labs(title = "Year: {frame_time}")

Facet the animation by continent

# Same animation over `year`, drawn as one panel per continent.
p +
  facet_wrap(facets = vars(continent)) +
  transition_time(time = year) +
  labs(title = "Year: {frame_time}")

Linear regression using the lm function:

# Simple OLS regression of life expectancy (Y) on population (X).
lrm.fit <- lm(formula = Y ~ X)
# Estimated intercept and slope.
coef(lrm.fit)
##  (Intercept)            X 
## 5.924048e+01 7.903656e-09

#Monte Carlo simulation

In statistics, ordinary least squares (OLS) is a type of linear least squares method for estimating the unknown parameters in a linear regression model. OLS chooses the parameters of a linear function of a set of explanatory variables by the principle of least squares: minimizing the sum of the squares of the differences between the observed dependent variable (values of the variable being observed) in the given dataset and those predicted by the linear function of the independent variable.

# Monte Carlo study of the OLS estimator: repeatedly simulate a response
# from a known linear model in X, re-estimate the intercept and slope with
# both lm() and fixest::feols(), and keep every estimate.

# Parameters
Beta0 <- 6        # True value of the intercept
Beta1 <- 0.00005  # True value of the slope
# Sample size. Derived from the regressor (1704 gapminder rows) instead of
# being hard-coded, so the noise vector always has exactly one draw per
# observation and can never be silently recycled against X.
n <- length(X)
N <- 5000         # Number of replications
set.seed(1234)    # Fix the RNG state so the results are reproducible

# Preallocated storage, one entry per replication
int.est <- numeric(N)       # intercept estimated with lm()
slp.est <- numeric(N)       # slope estimated with lm()
slope_DT <- numeric(N)      # slope estimated with feols()
intercept_DT <- numeric(N)  # intercept estimated with feols()

# NOTE: the loop reuses the global names `Y` and `lrm.fit`, so after it
# finishes they hold the last simulated response and its fit, not the
# life-expectancy data and model created earlier in the file.
for (i in seq_len(N)) {
  # Simulate the response: true line plus normal noise with sd = 5
  Y <- Beta0 + Beta1 * X + rnorm(n, mean = 0, sd = 5)
  data_i <- data.table(Y = Y, X = X)

  # Fit with lm() and store the estimates, selected by coefficient name
  lrm.fit <- lm(Y ~ X)
  int.est[i] <- coef(lrm.fit)[["(Intercept)"]]
  slp.est[i] <- coef(lrm.fit)[["X"]]

  # Fit the same model with feols() and store its estimates
  ols_i <- fixest::feols(Y ~ X, data = data_i)
  slope_DT[i] <- coef(ols_i)[["X"]]
  intercept_DT[i] <- coef(ols_i)[["(Intercept)"]]
}

Summary statistics using OLS

# Gather the feols estimates from every replication into one table and
# print descriptive statistics for both coefficients as plain text.
estimates_DT <- data.table(
  beta_1 = slope_DT,
  beta_0 = intercept_DT
)
stargazer(
  estimates_DT[, .(beta_1, beta_0)],
  type = "text"
)
## 
## ================================================================
## Statistic   N    Mean  St. Dev.   Min   Pctl(25) Pctl(75)  Max  
## ----------------------------------------------------------------
## beta_1    5,000 0.0001  0.000   0.00005 0.00005   0.0001  0.0001
## beta_0    5,000 5.998   0.127    5.500   5.912    6.082   6.454 
## ----------------------------------------------------------------